| dada2Cleaning |
1 |
- output/s6TidyData/ASV_samplCounts.csv
- output/s6TidyData/ASV_taxanomy.csv
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67 | # setwd(dirname(parent.frame(2)$ofile))
# Attach required packages. character.only = TRUE is needed because the
# package name is supplied as a string; use TRUE, never the reassignable T.
pkgs <- c("Biostrings", "dplyr")
for (pkg in pkgs) {
  library(pkg, character.only = TRUE)
}
# loger: write a timestamped log message to stdout.
#
# msg: character scalar to log.
# Returns NULL invisibly (the value of cat()).
# NOTE(review): the time part uses "-" separators ("%H-%M-%S"), not ":";
# presumably intentional, but confirm.
loger <- function(msg) {
  t <- strftime(Sys.time(), "%Y-%m-%d %H-%M-%S")
  cat("[ ", t, " ] ", msg, "\n")
}
# Restore seqtab.nochim (samples x ASV-sequence count matrix) and dna/taxid
# (taxonomy results) produced by the upstream rules.
load(snakemake@input[[1]])
load(snakemake@input[[2]])
# Arrange rows of seqtab.nochim -------------------------------------------
loger("Arrange rows of seqtab.nochim")
# Row names are numeric sample ids stored as strings; sort rows numerically.
orderIndex <- rownames(seqtab.nochim) %>%
  as.integer() %>%
  order()
seqtab.nochim <- seqtab.nochim[orderIndex, , drop = FALSE]
# Combine reverse complement ASV counts -----------------------------------
loger("Combine reverse complement ASV counts")
# Sequences whose reverse complement is also present in the table.
duplicate <- intersect(dna, reverseComplement(dna))
dupChr <- as.character(duplicate)
rcChr <- as.character(reverseComplement(duplicate))
index <- integer(0)    # one member of each RC pair
rc_index <- integer(0) # its reverse-complement partner, aligned by position
for (i in seq_along(duplicate)) {
  j <- match(rcChr[i], dupChr)
  # j > i counts each pair exactly once and skips palindromic sequences
  # (j == i), which would otherwise be summed with themselves and have
  # their counts doubled.
  if (!is.na(j) && j > i) {
    index <- c(index, i)
    rc_index <- c(rc_index, j)
  }
}
# Use match(), not which(%in%): which() returns indices in column order,
# so indexInRaw[k] and rc_indexInRaw[k] would not necessarily be an RC
# pair and unrelated columns could be summed together.
indexInRaw <- match(dupChr[index], colnames(seqtab.nochim))
rc_indexInRaw <- match(dupChr[rc_index], colnames(seqtab.nochim))
if (length(indexInRaw) > 0) {
  uniqSeqtab <- seqtab.nochim[, indexInRaw, drop = FALSE] +
    seqtab.nochim[, rc_indexInRaw, drop = FALSE]
  uniqSeqtab <- cbind(
    uniqSeqtab,
    seqtab.nochim[, -c(indexInRaw, rc_indexInRaw), drop = FALSE]
  ) %>%
    t()
} else {
  # No RC duplicates: -integer(0) would select ZERO columns and wipe the
  # table, so this case must be handled separately.
  uniqSeqtab <- t(seqtab.nochim)
}
seqs <- rownames(uniqSeqtab) %>%
  DNAStringSet()
rownames(uniqSeqtab) <- paste0("ASV", seq_len(nrow(uniqSeqtab)))
# Feature data (fdata) ----------------------------------------------------
loger("Clean up taxaid for feature data")
# Keeps ranks 1:6 only (domain..genus); the species column is dropped here —
# NOTE(review): confirm that dropping species is intended.
orderedTaxid <- taxid[as.character(seqs), 1:6]
# <Inspecting steps>
seqs_taxid <- rownames(orderedTaxid) %>%
  DNAStringSet()
print("Does DNA seqs(rowname) of taxid match DNA seqs(rowname) of seqtab.nochim?")
print(identical(seqs, seqs_taxid)) # check the rows of fdata matches the rows of edata
# <inspecting steps end>
rownames(orderedTaxid) <- paste0("ASV", seq_len(nrow(orderedTaxid)))
# Export to .csv ----------------------------------------------------------
loger("Export to .csv files")
write.csv(uniqSeqtab, file = "output/s6TidyData/ASV_samplCounts.csv")
write.csv(orderedTaxid, file = "output/s6TidyData/ASV_taxanomy.csv")
write.csv(seqs, file = "output/s6TidyData/ASV_dnaSeqs.csv")
save(seqs, file = "output/s6TidyData/ASV_dnaSeqs.rda")
|
|
| DADA2 |
1 |
- output/s5DADA2/img/ploterrF.png
- output/s5DADA2/img/ploterrR.png
- output/s5DADA2/DADA2_seqtab_nochim.rda
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109 | library(dada2)
library(dplyr)
library(ggplot2)
library(DECIPHER)
library(jsonlite)
packageVersion("dada2") # record the dada2 version in the run log
# loger: write a timestamped log message to stdout. Returns NULL invisibly.
loger <- function(msg) {
  t <- strftime(Sys.time(), "%Y-%m-%d %H-%M-%S")
  cat("[ ", t, " ] ", msg, "\n")
}
# --> Step1. Getting ready
loger("Getting ready")
path <- snakemake@params[[1]]
# Truncation lengths for r1/r2 chosen upstream and stored as JSON.
truncLens <- read_json("output/truncLens.json")
# cutLens <- read.csv(snakemake@input[[1]])
# Read in fastq file names by r1 and r2.
# Dots are escaped so the pattern matches "{sample}.r1.fastq" literally
# instead of treating "." as any character.
loger("Read in fastq file names")
fnfs <- list.files(
  path, pattern = "\\.r1\\.fastq$", full.names = TRUE
) %>%
  sort()
fnrs <- list.files(
  path, pattern = "\\.r2\\.fastq$", full.names = TRUE
) %>%
  sort()
# Extract sample id
# fastq file name format: {sample}.r1.fq
sampleID <- fnfs %>%
  basename() %>%
  strsplit("\\.") %>%
  sapply(`[`, 1)
# --> Step2. Inspect read quality profiles
# Done by fastqc
# --> Step3. Filter and trim
# Place filtered files in filtered/ subdirectory
loger("Filter and trim")
filfs <- file.path(path, "filtered", paste0("fasting_", sampleID, "_r1_filt.fq.gz"))
filrs <- file.path(path, "filtered", paste0("fasting_", sampleID, "_r2_filt.fq.gz"))
names(filfs) <- sampleID
names(filrs) <- sampleID
out <- filterAndTrim(
  fnfs, filfs, fnrs, filrs,
  truncLen = c(truncLens$r1, truncLens$r2),
  maxN = 0, # DADA2 requires no Ns
  maxEE = c(2, 2), # the maximum number of "expected errors" allowed in a read
  truncQ = 2,
  compress = TRUE,
  multithread = TRUE # On Windows set FALSE
)
# Remove samples that are empty after filtering: filterAndTrim writes no
# output file when zero reads pass, so keep only pairs where both mates'
# filtered files exist. Vectorized file.exists() replaces the original
# grow-by-append loop (and its scalar-context use of `|`).
loger("Removing samples that is empty after filtering")
keep <- file.exists(filfs) & file.exists(filrs)
# If nothing survives this yields an empty named vector (the loop produced
# NULL); downstream learnErrors() fails either way.
filfs <- filfs[keep]
filrs <- filrs[keep]
# --> Step4. Learn the Error Rates
loger("Learn error rates")
errF <- learnErrors(filfs, multithread = TRUE)
errR <- learnErrors(filrs, multithread = TRUE)
# Save diagnostic plots of the fitted error models.
loger("Generating plots of error rates")
ploterrF <- plotErrors(errF, nominalQ = TRUE)
ploterrR <- plotErrors(errR, nominalQ = TRUE)
ggsave("output/s5DADA2/img/ploterrF.png", plot = ploterrF, width = 8, height = 8, units = "in")
ggsave("output/s5DADA2/img/ploterrR.png", plot = ploterrR, width = 8, height = 8, units = "in")
# --> Step5. Sample Inference
loger("Sample inference, take some time...")
dadaFs <- dada(filfs, err = errF, multithread = TRUE)
dadaRs <- dada(filrs, err = errR, multithread = TRUE)
# --> Step6. Merge paired reads
loger("Merge paired reads")
mergers <- mergePairs(dadaFs, filfs, dadaRs, filrs, verbose = TRUE)
# To inspect the merger data frame of the first sample:
# head(mergers[[1]])
# --> Step7. Construct sequence table
loger("Construct seq table")
seqtab <- makeSequenceTable(mergers)
# Show the distribution of merged sequence lengths (auto-printed at top
# level). Lengths far from the expected amplicon size may indicate
# non-specific priming.
loger("Inspect distribuion of sequence lengths")
table(nchar(getSequences(seqtab)))
# --> Step8. Remove chimeras
loger("Remove chimera")
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus", multithread = TRUE, verbose = TRUE)
print("passing rate:")
print(sum(seqtab.nochim) / sum(seqtab))
save(seqtab.nochim, file = "output/s5DADA2/DADA2_seqtab_nochim.rda")
|
|
| taxonomy |
1 |
- output/s5DADA2/DADA2_taxaid.rda
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31 | library(dada2)
library(dplyr)
library(DECIPHER)
# loger: write a timestamped log message to stdout. Returns NULL invisibly.
loger <- function(msg) {
  t <- strftime(Sys.time(), "%Y-%m-%d %H-%M-%S")
  cat("[ ", t, " ] ", msg, "\n")
}
# load() restores seqtab.nochim (chimera-free ASV table) and, presumably,
# the trainingSet used by IdTaxa below — confirm against the rule inputs.
load(snakemake@input[[1]])
load(snakemake@input[[2]])
# NOTE(review): silva_species is assigned but never used in this script;
# kept to preserve the interface with the Snakemake rule.
silva_species <- snakemake@input[[3]]
# --> Step10. Assign taxonomy
loger("Assign taxonomy, take a long time...")
dna <- seqtab.nochim %>%
  getSequences() %>%
  DNAStringSet() # Create a DNAStringSet from ASVs
ids <- IdTaxa(dna, trainingSet, strand = "both", processors = NULL, verbose = FALSE) # use all processors
ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species")
# Convert the "Taxa" object to a matrix analogous to assignTaxonomy()
# output. vapply (not sapply) guarantees a character matrix with one row
# per rank even in edge cases.
taxid <- t(vapply(ids, function(x) {
  m <- match(ranks, x$rank)
  taxa <- x$taxon[m]
  taxa[startsWith(taxa, "unclassified_")] <- NA
  taxa
}, character(length(ranks))))
colnames(taxid) <- ranks
rownames(taxid) <- getSequences(seqtab.nochim)
save(dna, taxid, ids, file = "output/s5DADA2/DADA2_taxaid.rda")
|
|
| fastQC |
1 |
- output/s4FastQC/r1_fastqc.html
- output/s4FastQC/r2_fastqc.html
- output/s4FastQC/r1_fastqc.zip
- output/s4FastQC/r2_fastqc.zip
- output/s4FastQC/r1.fastq.gz
- output/s4FastQC/r2.fastq.gz
|
|
|
| cat {input.r1}|gzip -c > output/s4FastQC/r1.fastq.gz
cat {input.r2}|gzip -c > output/s4FastQC/r2.fastq.gz
# Gzip-compress both read files, then run FastQC on the compressed copies,
# writing the HTML/zip reports into output/s4FastQC.
fastqc -o output/s4FastQC output/s4FastQC/r1.fastq.gz output/s4FastQC/r2.fastq.gz
|
|
| DownloadRefDB |
1 |
- database/SILVA_SSU.RData
- database/silva_species.fa.gz
|
|
|
| wget {params.taxa_url} --output-document=database/SILVA_SSU.RData -q
# Quietly (-q) download the SILVA reference files; the .RData file is
# presumably the IdTaxa training set loaded by the taxonomy rule — confirm.
wget {params.species_url} --output-document=database/silva_species.fa.gz -q
|
|
| concatenate |
6 |
- output/s3Combine/1.r1.fastq
- output/s3Combine/1.r2.fastq
- output/s3Combine/2.r1.fastq
- output/s3Combine/2.r2.fastq
- output/s3Combine/3.r1.fastq
- output/s3Combine/3.r2.fastq
- output/s3Combine/4.r1.fastq
- output/s3Combine/4.r2.fastq
- output/s3Combine/5.r1.fastq
- output/s3Combine/5.r2.fastq
- output/s3Combine/6.r1.fastq
- output/s3Combine/6.r2.fastq
|
|
|
| # set -x
# Concatenate the "first" and "second" orientation reads per sample,
# mate by mate (r1 with r1, r2 with r2).
cat {input.first_r1} {input.second_r1} > {output.r1}
cat {input.first_r2} {input.second_r2} > {output.r2}
|
|
| CutAdapt |
6 |
- output/s2CutAdapt/first.1.r1.fastq
- output/s2CutAdapt/first.1.r2.fastq
- output/s2CutAdapt/second.1.r2.fastq
- output/s2CutAdapt/second.1.r1.fastq
- output/s2CutAdapt/first.2.r1.fastq
- output/s2CutAdapt/first.2.r2.fastq
- output/s2CutAdapt/second.2.r2.fastq
- output/s2CutAdapt/second.2.r1.fastq
- output/s2CutAdapt/first.3.r1.fastq
- output/s2CutAdapt/first.3.r2.fastq
- output/s2CutAdapt/second.3.r2.fastq
- output/s2CutAdapt/second.3.r1.fastq
- output/s2CutAdapt/first.4.r1.fastq
- output/s2CutAdapt/first.4.r2.fastq
- output/s2CutAdapt/second.4.r2.fastq
- output/s2CutAdapt/second.4.r1.fastq
- output/s2CutAdapt/first.5.r1.fastq
- output/s2CutAdapt/first.5.r2.fastq
- output/s2CutAdapt/second.5.r2.fastq
- output/s2CutAdapt/second.5.r1.fastq
- output/s2CutAdapt/first.6.r1.fastq
- output/s2CutAdapt/first.6.r2.fastq
- output/s2CutAdapt/second.6.r2.fastq
- output/s2CutAdapt/second.6.r1.fastq
|
|
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 | adapterFwd={params.barcode}{params.primer_fwd}
adapterRev={params.primer_rev}
# First pass: trim barcode+forward primer from the 5' end of r1 (-g) and the
# reverse primer from r2 (-G); drop read pairs where no adapter was found.
# NOTE(review): --discard-untrimmed is given twice (after -o and after -p);
# repeating the flag is harmless but one occurrence would suffice — confirm.
cutadapt -e 0.15 \
-g $adapterFwd \
-G $adapterRev \
-o {output.first_r1} --discard-untrimmed \
-p {output.first_r2} --discard-untrimmed \
{input.first_r1} \
{input.first_r2} \
-j {threads}
# Second pass: r1/r2 are swapped in both inputs and outputs, so the forward
# adapter is searched on r2; presumably these are reads sequenced in the
# opposite orientation — confirm against the demultiplexing rules.
cutadapt -e 0.15 \
-g $adapterFwd -G $adapterRev \
-o {output.second_r2} --discard-untrimmed \
-p {output.second_r1} --discard-untrimmed \
{input.second_r2} \
{input.second_r1} \
-j {threads}
|
|
| Demultiplex_fwd |
1 |
- output/s1Demultiplex/first.1.r1.fastq
- output/s1Demultiplex/first.2.r1.fastq
- output/s1Demultiplex/first.3.r1.fastq
- output/s1Demultiplex/first.4.r1.fastq
- output/s1Demultiplex/first.5.r1.fastq
- output/s1Demultiplex/first.6.r1.fastq
- output/s1Demultiplex/first.1.r2.fastq
- output/s1Demultiplex/first.2.r2.fastq
- output/s1Demultiplex/first.3.r2.fastq
- output/s1Demultiplex/first.4.r2.fastq
- output/s1Demultiplex/first.5.r2.fastq
- output/s1Demultiplex/first.6.r2.fastq
- output/s1Demultiplex/first.unmatched.r1.fastq
- output/s1Demultiplex/first.unmatched.r2.fastq
- output/s1Demultiplex/first.Sample.r1.fastq
- output/s1Demultiplex/first.Sample.r2.fastq
|
|
|
| fastq-multx -m 0 -x -b \
-B {input.barcode_file} \
{input.first_fqfile} \
-o {params.first_r1} \
-o {params.first_r2}
# Demultiplex by barcode (-B file), exact matching (-m 0), without trimming
# the barcode (-x).
# NOTE(review): two -o output templates are given but only ONE input fastq,
# while the "_rev" rule passes two inputs; confirm this invocation is intended.
|
|
| Demultiplex_rev |
1 |
- output/s1Demultiplex/second.1.r1.fastq
- output/s1Demultiplex/second.2.r1.fastq
- output/s1Demultiplex/second.3.r1.fastq
- output/s1Demultiplex/second.4.r1.fastq
- output/s1Demultiplex/second.5.r1.fastq
- output/s1Demultiplex/second.6.r1.fastq
- output/s1Demultiplex/second.1.r2.fastq
- output/s1Demultiplex/second.2.r2.fastq
- output/s1Demultiplex/second.3.r2.fastq
- output/s1Demultiplex/second.4.r2.fastq
- output/s1Demultiplex/second.5.r2.fastq
- output/s1Demultiplex/second.6.r2.fastq
- output/s1Demultiplex/second.unmatched.r1.fastq
- output/s1Demultiplex/second.unmatched.r2.fastq
- output/s1Demultiplex/second.Sample.r1.fastq
- output/s1Demultiplex/second.Sample.r2.fastq
|
|
|
| fastq-multx -m 0 -x -b \
-B {input.barcode_file} \
{input.second_r2} \
{input.second_r1} \
-o {params.second_r2} \
-o {params.second_r1}
# Demultiplex by barcode (-B file), exact matching (-m 0), without trimming
# (-x). r2 is listed first, so barcodes are matched against r2; presumably
# the "second" library is barcoded on the reverse read — confirm.
|
|